Individual Claims Regression

library(corrplot)
## corrplot 0.92 loaded
library(gridExtra)
library(data.table)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.1.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::between()   masks data.table::between()
## x dplyr::combine()   masks gridExtra::combine()
## x dplyr::filter()    masks stats::filter()
## x dplyr::first()     masks data.table::first()
## x dplyr::lag()       masks stats::lag()
## x dplyr::last()      masks data.table::last()
## x purrr::transpose() masks data.table::transpose()
library(dplyr)
library(tidyr)
library(ggplot2)
library(plotly)
## 
## Attachement du package : 'plotly'
## L'objet suivant est masqué depuis 'package:ggplot2':
## 
##     last_plot
## L'objet suivant est masqué depuis 'package:stats':
## 
##     filter
## L'objet suivant est masqué depuis 'package:graphics':
## 
##     layout
library(knitr)
library(DT)
library(pals)
library(splines)
library(MASS)
## 
## Attachement du package : 'MASS'
## L'objet suivant est masqué depuis 'package:plotly':
## 
##     select
## L'objet suivant est masqué depuis 'package:dplyr':
## 
##     select

Loading Data

df_claims = read.csv("14OURE-PG_2017_CLAIMS_YEAR0.csv")
df_year0 = read.csv("14OURE-PG_2017_YEAR0.csv")
df_year1 = read.csv("14OURE-PG_2017_YEAR1.csv")


df_regions = read.csv("departements_francais.csv", sep = ";") %>% dplyr::select(NUMERO, REGION, "DENSITE..habitants.km2.")

df_year0 = df_year0 %>% mutate(NUMERO = substr(df_year0$pol_insee_code,1,2))
df_year0 = left_join(df_year0, df_regions, by = "NUMERO" )

df_year0$DENSITE..habitants.km2. <- as.numeric(gsub(",", ".", df_year0$DENSITE..habitants.km2.)) #convert density to numeric
names(df_year0)[names(df_year0)=="DENSITE..habitants.km2."] <- 'density'
df_claims_0 = df_claims[(df_claims$claim_amount >= 0),]

# create final merged df
df = left_join(df_year0, df_claims_0, by = c("id_policy")) %>%
  mutate(claim_nb = ifelse(is.na(claim_nb), 0, claim_nb), claim_amount=ifelse(is.na(claim_amount), 0, claim_amount)) 

df <-  df[df$claim_amount > 0,] #keep individuals claims for modelisation

Categorize variables

df$drv_age1G <- cut(df$drv_age1, c(17, 2:8*10, max(df$drv_age1)))
df$drv_age2G <- cut(df$drv_age1, c(17, 25, 45,max(df$drv_age1)))

df$vh_ageG <- cut(df$vh_age, c(0, 10, 25, 30, 100))#no vehicle with age 0, min value is 1 in the dataset
df$vh_valueG = cut(df$vh_value, c(min(df$vh_value), 8000, 15000, 22000, max(df$vh_value)), include.lowest = TRUE)
df$vh_dinG = cut(df$vh_din, c(min(df$vh_din), 50, 220, max(df$vh_din)), include.lowest = TRUE)
df$vh_makeG = ifelse(df$vh_make %in% c("RENAULT", "NISSAN", "CITROEN"), "A", 
                              ifelse(df$vh_make %in% c("VOLKSWAGEN", "AUDI", "SKODA", "SEAT"), "B",
                              ifelse(df$vh_make %in% c("OPEL", "GENERAL MOTORS", "FORD"), "C",
                              ifelse(df$vh_make %in% c("FIAT"), "D",
                              ifelse(df$vh_make %in% c("MERCEDES BENZ", "BMW", "CITROEN"), "E", "G")))))


df$densityG<- cut(df$density,c(0,40,200,500,4500,Inf),
include.lowest = TRUE)

df$pol_bonusG = cut(df$pol_bonus, c(0.5, 0.9, 1.2, 1.4, max(df$pol_bonus)), include.lowest = TRUE)
df$pol_durationG = cut(df$pol_duration, c(min(df$pol_duration),10, 20,25,30, max(df$pol_duration)), include.lowest = TRUE)

df$pol_coverageG = ifelse(df$pol_coverage %in% c("Median1", "Median2"), "Median", df$pol_coverage)

var <-c("drv_age1","drv_age1G","drv_age2G" ,"vh_ageG", "vh_age" , "vh_valueG", "vh_value" ,"vh_din", "vh_dinG","vh_type", "vh_fuel", "vh_makeG", "densityG","density","pol_bonus", "pol_bonusG" ,"pol_durationG", "pol_coverageG", "claim_amount") 
#keep features we worked on andh the original for some continous ones
df <- df[var]

Claims regression

formula = log(claim_amount)~drv_age2G + vh_ageG +vh_valueG + vh_dinG +vh_makeG + densityG + pol_bonusG + pol_durationG +pol_coverageG + vh_type + vh_fuel
reg.logn <- lm(formula ,data=df)#[df$claim_amount<15000,]

summary(reg.logn)
## 
## Call:
## lm(formula = formula, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.2603 -1.0583  0.0478  1.1342  6.7410 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  5.470823   0.221437  24.706  < 2e-16 ***
## drv_age2G(25,45]            -0.233306   0.124101  -1.880  0.06013 .  
## drv_age2G(45,103]           -0.083907   0.123786  -0.678  0.49789    
## vh_ageG(10,25]              -0.138167   0.033068  -4.178 2.96e-05 ***
## vh_ageG(25,30]               0.525206   0.220643   2.380  0.01731 *  
## vh_ageG(30,100]             -0.043645   0.309494  -0.141  0.88786    
## vh_valueG(8e+03,1.5e+04]    -0.074437   0.176655  -0.421  0.67349    
## vh_valueG(1.5e+04,2.2e+04]   0.061569   0.178986   0.344  0.73086    
## vh_valueG(2.2e+04,1.32e+05]  0.099337   0.180214   0.551  0.58150    
## vh_dinG(50,220]              0.725777   0.161174   4.503 6.76e-06 ***
## vh_dinG(220,507]             1.240460   0.207559   5.976 2.34e-09 ***
## vh_makeGB                   -0.007619   0.051079  -0.149  0.88143    
## vh_makeGC                    0.068735   0.053510   1.285  0.19898    
## vh_makeGD                    0.125928   0.089159   1.412  0.15786    
## vh_makeGE                    0.155744   0.067493   2.308  0.02104 *  
## vh_makeGG                    0.082177   0.032518   2.527  0.01151 *  
## densityG(40,200]             0.051032   0.064408   0.792  0.42819    
## densityG(200,500]            0.129857   0.069013   1.882  0.05991 .  
## densityG(500,4.5e+03]       -0.008089   0.078408  -0.103  0.91783    
## densityG(4.5e+03,Inf]        0.130164   0.077793   1.673  0.09431 .  
## pol_bonusG(0.9,1.2]          0.208619   0.107375   1.943  0.05205 .  
## pol_bonusG(1.2,1.4]          0.821494   0.600486   1.368  0.17132    
## pol_bonusG(1.4,1.56]         0.833189   0.791647   1.052  0.29260    
## pol_durationG(10,20]         0.070193   0.033221   2.113  0.03463 *  
## pol_durationG(20,25]        -0.079843   0.052963  -1.508  0.13170    
## pol_durationG(25,30]         0.120919   0.052681   2.295  0.02173 *  
## pol_durationG(30,39]        -0.217601   0.146913  -1.481  0.13859    
## pol_coverageGMedian         -0.780744   0.033137 -23.561  < 2e-16 ***
## pol_coverageGMini           -0.768901   0.064257 -11.966  < 2e-16 ***
## vh_typeTourism              -0.165298   0.051268  -3.224  0.00127 ** 
## vh_fuelGasoline              0.019220   0.033451   0.575  0.56559    
## vh_fuelHybrid                0.191175   0.424205   0.451  0.65224    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.581 on 12992 degrees of freedom
## Multiple R-squared:  0.06464,    Adjusted R-squared:  0.06241 
## F-statistic: 28.96 on 31 and 12992 DF,  p-value: < 2.2e-16
formula <-  claim_amount ~drv_age2G + vh_ageG +vh_valueG + vh_dinG +vh_makeG + densityG + pol_bonusG + pol_durationG +pol_coverageG + vh_type + vh_fuel

reg.gamma <- glm(formula,family=Gamma(link="log"),data=df[df$claim_amount<15000,])#focus on subset 
summary(reg.gamma)
## 
## Call:
## glm(formula = formula, family = Gamma(link = "log"), data = df[df$claim_amount < 
##     15000, ])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.7930  -1.5489  -0.8305   0.1299   6.1798  
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  6.632420   0.233736  28.376  < 2e-16 ***
## drv_age2G(25,45]            -0.185768   0.130835  -1.420  0.15567    
## drv_age2G(45,103]           -0.076800   0.130499  -0.589  0.55620    
## vh_ageG(10,25]              -0.208069   0.034931  -5.957 2.64e-09 ***
## vh_ageG(25,30]               0.324265   0.234872   1.381  0.16742    
## vh_ageG(30,100]             -0.076819   0.326417  -0.235  0.81395    
## vh_valueG(8e+03,1.5e+04]     0.063366   0.187388   0.338  0.73525    
## vh_valueG(1.5e+04,2.2e+04]   0.131967   0.189850   0.695  0.48700    
## vh_valueG(2.2e+04,1.32e+05]  0.141246   0.191145   0.739  0.45995    
## vh_dinG(50,220]              0.394551   0.170250   2.317  0.02049 *  
## vh_dinG(220,507]             0.587790   0.219085   2.683  0.00731 ** 
## vh_makeGB                   -0.022290   0.053888  -0.414  0.67915    
## vh_makeGC                    0.078971   0.056474   1.398  0.16203    
## vh_makeGD                    0.124002   0.094004   1.319  0.18715    
## vh_makeGE                    0.157463   0.071254   2.210  0.02713 *  
## vh_makeGG                    0.041687   0.034328   1.214  0.22463    
## densityG(40,200]             0.037201   0.067908   0.548  0.58383    
## densityG(200,500]            0.084991   0.072770   1.168  0.24286    
## densityG(500,4.5e+03]        0.029857   0.082703   0.361  0.71810    
## densityG(4.5e+03,Inf]        0.092881   0.082039   1.132  0.25759    
## pol_bonusG(0.9,1.2]          0.199666   0.113200   1.764  0.07778 .  
## pol_bonusG(1.2,1.4]          0.454897   0.633061   0.719  0.47242    
## pol_bonusG(1.4,1.56]         0.481863   0.834567   0.577  0.56369    
## pol_durationG(10,20]         0.008678   0.035061   0.248  0.80452    
## pol_durationG(20,25]        -0.039237   0.055956  -0.701  0.48318    
## pol_durationG(25,30]         0.125322   0.055630   2.253  0.02429 *  
## pol_durationG(30,39]        -0.425333   0.158234  -2.688  0.00720 ** 
## pol_coverageGMedian         -0.462150   0.034975 -13.214  < 2e-16 ***
## pol_coverageGMini           -0.490353   0.067748  -7.238 4.81e-13 ***
## vh_typeTourism              -0.109370   0.054166  -2.019  0.04349 *  
## vh_fuelGasoline              0.044157   0.035325   1.250  0.21131    
## vh_fuelHybrid               -0.144775   0.447207  -0.324  0.74615    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 2.776586)
## 
##     Null deviance: 27685  on 12989  degrees of freedom
## Residual deviance: 26758  on 12958  degrees of freedom
## AIC: 201470
## 
## Number of Fisher Scoring iterations: 9
reg.invgaus <- glm(formula ,family=inverse.gaussian(link="1/mu^2"),data=df[df$claim_amount<15000,])
summary(reg.invgaus)
## 
## Call:
## glm(formula = formula, family = inverse.gaussian(link = "1/mu^2"), 
##     data = df[df$claim_amount < 15000, ])
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.76730  -0.08070  -0.03203   0.00425   0.17955  
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  2.780e-06  8.381e-07   3.317 0.000913 ***
## drv_age2G(25,45]             2.200e-07  2.112e-07   1.042 0.297530    
## drv_age2G(45,103]            5.698e-09  2.059e-07   0.028 0.977926    
## vh_ageG(10,25]               5.028e-07  9.085e-08   5.534 3.20e-08 ***
## vh_ageG(25,30]              -2.876e-07  1.702e-07  -1.690 0.091051 .  
## vh_ageG(30,100]             -9.342e-08  7.689e-07  -0.121 0.903301    
## vh_valueG(8e+03,1.5e+04]     1.635e-08  5.293e-07   0.031 0.975362    
## vh_valueG(1.5e+04,2.2e+04]  -1.265e-07  5.310e-07  -0.238 0.811790    
## vh_valueG(2.2e+04,1.32e+05] -1.421e-07  5.324e-07  -0.267 0.789563    
## vh_dinG(50,220]             -2.005e-06  7.906e-07  -2.536 0.011223 *  
## vh_dinG(220,507]            -2.182e-06  8.108e-07  -2.691 0.007123 ** 
## vh_makeGB                    7.423e-08  1.235e-07   0.601 0.547831    
## vh_makeGC                   -1.122e-07  1.111e-07  -1.010 0.312541    
## vh_makeGD                   -1.804e-07  1.758e-07  -1.026 0.304706    
## vh_makeGE                   -1.981e-07  1.133e-07  -1.749 0.080300 .  
## vh_makeGG                   -7.261e-08  7.167e-08  -1.013 0.311033    
## densityG(40,200]            -5.709e-08  1.540e-07  -0.371 0.710888    
## densityG(200,500]           -1.337e-07  1.615e-07  -0.828 0.407585    
## densityG(500,4.5e+03]       -6.665e-08  1.811e-07  -0.368 0.712835    
## densityG(4.5e+03,Inf]       -1.279e-07  1.741e-07  -0.735 0.462630    
## pol_bonusG(0.9,1.2]         -2.992e-07  1.916e-07  -1.562 0.118422    
## pol_bonusG(1.2,1.4]         -4.141e-07  6.611e-07  -0.626 0.531048    
## pol_bonusG(1.4,1.56]        -6.911e-07  9.208e-07  -0.751 0.452934    
## pol_durationG(10,20]        -1.023e-08  7.166e-08  -0.143 0.886522    
## pol_durationG(20,25]         6.808e-08  1.204e-07   0.565 0.571819    
## pol_durationG(25,30]        -1.879e-07  9.129e-08  -2.058 0.039581 *  
## pol_durationG(30,39]         1.585e-06  7.308e-07   2.169 0.030135 *  
## pol_coverageGMedian          1.355e-06  1.248e-07  10.856  < 2e-16 ***
## pol_coverageGMini            1.382e-06  2.610e-07   5.294 1.22e-07 ***
## vh_typeTourism               1.817e-07  9.471e-08   1.919 0.055042 .  
## vh_fuelGasoline             -6.639e-08  7.330e-08  -0.906 0.365140    
## vh_fuelHybrid                1.771e-07  9.382e-07   0.189 0.850268    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for inverse.gaussian family taken to be 0.003277493)
## 
##     Null deviance: 165.75  on 12989  degrees of freedom
## Residual deviance: 164.70  on 12958  degrees of freedom
## AIC: 206785
## 
## Number of Fisher Scoring iterations: 10
mean(df[df$claim_amount<15000, 'claim_amount']) #empirical mean
## [1] 972.7987
sigma <- summary(reg.logn)$sigma
mean(exp(predict(reg.logn))*exp(sigma^2/2)) #mean for log normal
## [1] 1278.154
mean(predict(reg.gamma,type="response")) #mean gamma
## [1] 971.0181
mean(predict(reg.invgaus,type="response"))# mean inverse gaussian
## [1] 972.8024

Differentiating large claims

ordered_claim <- df[order(-df$claim_amount),c("claim_amount","drv_age1","vh_age","vh_value","vh_fuel","density")]
ordered_claim$aggamount <-cumsum(ordered_claim$claim_amount)/sum(ordered_claim$claim_amount)*100
ordered_claim$aggnb <- (1:length(df$claim_amount))/length(df$claim_amount) * 100

#ordered_claim[ordered_claim$aggnb >= 2.5,c('claim_amount','aggamount', 'aggnb')] # 57 sinistres comptent pour 10% des montants totaux ! 

print(tail(ordered_claim[ordered_claim$claim_amount >= 6000,c('claim_amount','aggamount', 'aggnb')]))
##       claim_amount aggamount    aggnb
## 86871      6050.00  25.33399 2.426290
## 62114      6047.08  25.37817 2.433968
## 46670      6039.41  25.42230 2.441646
## 33169      6034.74  25.46639 2.449324
## 54947      6024.31  25.51041 2.457002
## 28101      6002.17  25.55427 2.464681

Nous avons constaté par une étude descriptive que seulement 5% des sinsitres représentaient près de 40% de la charge sinistre totale. Nous choisissons un niveau de séparation de 6000€. Les sinistres supérieurs à ce montant représentent alors 2.46% des sinistres de la base ainsi que 25.5% de la charge sinistre totale.

Ecrêtement

u <- 6000 #valeur de séparation


su<- sum(ifelse(df$claim_amount - u > 0, df$claim_amount - u, 0))/length(df$claim_amount) #charge surcrête moyenne
df$surcrete <- ifelse (df$claim_amount<u, df$claim_amount, u) + su

summary(df$surcrete)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   121.0   236.6   485.2  1050.8  1238.2  6120.7

Nous pouvons maintenant proposer une modélisation pour les nouveaux montants encrếtés.

formula <-  surcrete ~drv_age2G + vh_ageG +vh_valueG + vh_dinG +vh_makeG + densityG + pol_bonusG + pol_durationG +pol_coverageG + vh_type + vh_fuel

reg.logn <- lm(log(surcrete)~drv_age1 + vh_ageG +vh_value + pol_durationG ,data=df)
summary(reg.logn)
## 
## Call:
## lm(formula = log(surcrete) ~ drv_age1 + vh_ageG + vh_value + 
##     pol_durationG, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0901 -0.8728 -0.1670  0.7576  2.8294 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           5.961e+00  4.184e-02 142.481  < 2e-16 ***
## drv_age1              3.502e-03  6.264e-04   5.591 2.30e-08 ***
## vh_ageG(10,25]       -1.104e-01  2.121e-02  -5.207 1.95e-07 ***
## vh_ageG(25,30]        3.490e-01  1.445e-01   2.415 0.015754 *  
## vh_ageG(30,100]      -9.108e-02  1.891e-01  -0.482 0.630161    
## vh_value              1.072e-05  1.059e-06  10.124  < 2e-16 ***
## pol_durationG(10,20]  6.368e-02  2.191e-02   2.907 0.003656 ** 
## pol_durationG(20,25] -2.462e-02  3.499e-02  -0.704 0.481614    
## pol_durationG(25,30]  1.299e-01  3.475e-02   3.737 0.000187 ***
## pol_durationG(30,39] -1.819e-01  9.730e-02  -1.870 0.061562 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.048 on 13014 degrees of freedom
## Multiple R-squared:  0.01672,    Adjusted R-squared:  0.01604 
## F-statistic: 24.59 on 9 and 13014 DF,  p-value: < 2.2e-16
reg.gamma <- glm(surcrete~drv_age1 + vh_ageG +vh_value + pol_durationG  ,family=Gamma(link="log"),data=df)
summary(reg.gamma)
## 
## Call:
## glm(formula = surcrete ~ drv_age1 + vh_ageG + vh_value + pol_durationG, 
##     family = Gamma(link = "log"), data = df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8244  -1.1831  -0.6736   0.1699   3.1478  
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           6.672e+00  5.109e-02 130.601  < 2e-16 ***
## drv_age1              2.441e-03  7.649e-04   3.192 0.001416 ** 
## vh_ageG(10,25]       -1.535e-01  2.590e-02  -5.927 3.15e-09 ***
## vh_ageG(25,30]        3.837e-01  1.764e-01   2.174 0.029687 *  
## vh_ageG(30,100]      -6.913e-02  2.310e-01  -0.299 0.764711    
## vh_value              8.183e-06  1.293e-06   6.327 2.58e-10 ***
## pol_durationG(10,20]  4.566e-02  2.675e-02   1.707 0.087858 .  
## pol_durationG(20,25] -2.740e-03  4.272e-02  -0.064 0.948860    
## pol_durationG(25,30]  1.608e-01  4.243e-02   3.791 0.000151 ***
## pol_durationG(30,39] -4.279e-02  1.188e-01  -0.360 0.718738    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 1.637889)
## 
##     Null deviance: 15664  on 13023  degrees of freedom
## Residual deviance: 15453  on 13014  degrees of freedom
## AIC: 207252
## 
## Number of Fisher Scoring iterations: 6
reg.invgaus <- glm(surcrete~drv_age1 + vh_ageG +vh_value + pol_durationG  ,family=inverse.gaussian(link="1/mu^2"),data=df)
summary(reg.invgaus)
## 
## Call:
## glm(formula = surcrete ~ drv_age1 + vh_ageG + vh_value + pol_durationG, 
##     family = inverse.gaussian(link = "1/mu^2"), data = df)
## 
## Deviance Residuals: 
##       Min         1Q     Median         3Q        Max  
## -0.082251  -0.049925  -0.024162   0.005089   0.084602  
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           1.313e-06  8.245e-08  15.923  < 2e-16 ***
## drv_age1             -4.671e-09  1.373e-09  -3.402  0.00067 ***
## vh_ageG(10,25]        3.236e-07  5.422e-08   5.969 2.44e-09 ***
## vh_ageG(25,30]       -4.566e-07  1.661e-07  -2.748  0.00600 ** 
## vh_ageG(30,100]       2.312e-07  5.160e-07   0.448  0.65406    
## vh_value             -7.987e-12  2.944e-13 -27.132  < 2e-16 ***
## pol_durationG(10,20] -8.678e-08  4.736e-08  -1.832  0.06693 .  
## pol_durationG(20,25] -1.111e-09  7.924e-08  -0.014  0.98881    
## pol_durationG(25,30] -2.620e-07  6.349e-08  -4.127 3.69e-05 ***
## pol_durationG(30,39]  1.604e-07  2.526e-07   0.635  0.52552    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for inverse.gaussian family taken to be 0.001594935)
## 
##     Null deviance: 22.871  on 13023  degrees of freedom
## Residual deviance: 22.682  on 13014  degrees of freedom
## AIC: 202582
## 
## Number of Fisher Scoring iterations: 11

Modèles écrêtement et sélection de variables :

Nous optons pour un modèle par écrêtement par souci de simplicité par la suite. Nous allons dans cette partie procéder à une sélection de variables par Stepwise AIC et comparer les modèles obtenue en sortie.

Modèle Gamma

Backward Selection

reg.gamma.full <- glm(surcrete~.,family=Gamma(link="log"),data=subset(df, select= -c(claim_amount))) #do not keep claim amount ! 
gamma_backward <- step(reg.gamma.full, direction = 'backward')
summary(gamma_backward)

Forward Selection

reg.gamma.zero <- glm(surcrete~1,family=Gamma(link="log"),data=na.omit(subset(df, select= -c(claim_amount)))) #do not keep claim amount ! 

gamma_forward <- step(reg.gamma.zero, scope = list(lower=formula(reg.gamma.zero),upper=formula(reg.gamma.full)), direction = 'forward')

summary(gamma_forward)

Stepwise selection full model

gamma_stepwise <- step(reg.gamma.full, direction = 'both')

summary(gamma_stepwise)

Nous comparons les résultats de ces 3 modèles : en particulier, nous remarquerons que les variables sélectionnées sont régulièrement les mêmes, peu importe la méthode. Finalement, nous retiendrons la méthode stepwise qui fournit des résultats similaires à la méthode backward mais semble plus précautionneuse. Nous devons tout de même relever que les améliorations sur l’AIC sont marginales ! Les variables retenues sont souvent assez similaires. Nous remarquons que certaines variables peuvent êtres présents à la fois en groupe et en continue, nous conservons manuellement l’une d’entre elle seulement.

Stepwise for Lognormal and Inverse Gaussian

Inverse Gaussian stepwise

reg.invgaus.full <- glm(surcrete~. ,family=inverse.gaussian(link="1/mu^2"),data=na.omit(subset(df, select=-c(claim_amount))))

invgaus_stepwise <- step(reg.invgaus.full, direction = 'both')
#summary(reg.invgaus)
summary(invgaus_stepwise)
## 
## Call:
## glm(formula = surcrete ~ drv_age2G + vh_ageG + vh_din + vh_dinG + 
##     vh_type + pol_bonus + pol_durationG + pol_coverageG, family = inverse.gaussian(link = "1/mu^2"), 
##     data = na.omit(subset(df, select = -c(claim_amount))))
## 
## Deviance Residuals: 
##       Min         1Q     Median         3Q        Max  
## -0.082577  -0.049352  -0.023877   0.004786   0.104212  
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           2.008e-06  4.765e-07   4.214 2.52e-05 ***
## drv_age2G(25,45]      2.135e-07  1.610e-07   1.326  0.18486    
## drv_age2G(45,103]     5.207e-08  1.636e-07   0.318  0.75036    
## vh_ageG(10,25]        2.871e-07  5.511e-08   5.210 1.92e-07 ***
## vh_ageG(25,30]       -3.344e-07  1.316e-07  -2.541  0.01108 *  
## vh_ageG(30,100]      -1.345e-07  4.631e-07  -0.291  0.77142    
## vh_din               -2.018e-09  4.272e-10  -4.723 2.34e-06 ***
## vh_dinG(50,220]      -1.134e-06  4.061e-07  -2.792  0.00525 ** 
## vh_dinG(220,507]     -1.014e-06  4.464e-07  -2.271  0.02318 *  
## vh_typeTourism        1.945e-07  6.172e-08   3.152  0.00163 ** 
## pol_bonus            -4.294e-07  2.145e-07  -2.001  0.04538 *  
## pol_durationG(10,20] -4.861e-08  4.785e-08  -1.016  0.30977    
## pol_durationG(20,25]  3.783e-08  8.041e-08   0.470  0.63806    
## pol_durationG(25,30] -1.806e-07  6.105e-08  -2.959  0.00310 ** 
## pol_durationG(30,39]  1.316e-07  2.559e-07   0.514  0.60711    
## pol_coverageGMedian   9.378e-07  7.461e-08  12.570  < 2e-16 ***
## pol_coverageGMini     9.117e-07  1.523e-07   5.986 2.21e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for inverse.gaussian family taken to be 0.001741437)
## 
##     Null deviance: 22.871  on 13023  degrees of freedom
## Residual deviance: 22.218  on 13007  degrees of freedom
## AIC: 202327
## 
## Number of Fisher Scoring iterations: 9

lognormal stepwise

reg.logn.full <- lm(log(surcrete) ~. , data=na.omit(subset(df, select=-c(claim_amount))))
logn_stepwise <- step(reg.logn.full, direction = 'both')
summary(logn_stepwise)
## 
## Call:
## lm(formula = log(surcrete) ~ drv_age1 + drv_age2G + vh_ageG + 
##     vh_age + vh_valueG + vh_value + vh_din + vh_dinG + vh_type + 
##     densityG + pol_bonus + pol_durationG + pol_coverageG, data = na.omit(subset(df, 
##     select = -c(claim_amount))))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1426 -0.8344 -0.1797  0.7235  2.8935 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  5.852e+00  1.732e-01  33.781  < 2e-16 ***
## drv_age1                     2.125e-03  9.575e-04   2.219 0.026478 *  
## drv_age2G(25,45]            -1.812e-01  8.251e-02  -2.196 0.028130 *  
## drv_age2G(45,103]           -1.258e-01  8.951e-02  -1.405 0.160050    
## vh_ageG(10,25]              -1.843e-01  3.512e-02  -5.247 1.57e-07 ***
## vh_ageG(25,30]               1.545e-01  1.573e-01   0.982 0.325989    
## vh_ageG(30,100]             -2.496e-01  2.189e-01  -1.140 0.254170    
## vh_age                       8.244e-03  3.066e-03   2.689 0.007183 ** 
## vh_valueG(8e+03,1.5e+04]    -9.488e-02  1.148e-01  -0.826 0.408660    
## vh_valueG(1.5e+04,2.2e+04]  -8.695e-02  1.176e-01  -0.739 0.459858    
## vh_valueG(2.2e+04,1.32e+05] -1.807e-01  1.242e-01  -1.455 0.145704    
## vh_value                     7.421e-06  2.790e-06   2.660 0.007825 ** 
## vh_din                       1.423e-03  6.141e-04   2.317 0.020521 *  
## vh_dinG(50,220]              3.964e-01  1.061e-01   3.737 0.000187 ***
## vh_dinG(220,507]             3.717e-01  1.560e-01   2.383 0.017190 *  
## vh_typeTourism              -1.213e-01  3.337e-02  -3.636 0.000278 ***
## densityG(40,200]             3.060e-02  4.183e-02   0.732 0.464435    
## densityG(200,500]            8.379e-02  4.482e-02   1.869 0.061587 .  
## densityG(500,4.5e+03]        1.541e-02  5.093e-02   0.303 0.762224    
## densityG(4.5e+03,Inf]        8.233e-02  5.053e-02   1.629 0.103284    
## pol_bonus                    2.531e-01  1.015e-01   2.493 0.012672 *  
## pol_durationG(10,20]         4.730e-02  2.181e-02   2.169 0.030120 *  
## pol_durationG(20,25]        -4.770e-02  3.459e-02  -1.379 0.167958    
## pol_durationG(25,30]         9.616e-02  3.441e-02   2.795 0.005200 ** 
## pol_durationG(30,39]        -1.717e-01  9.549e-02  -1.798 0.072242 .  
## pol_coverageGMedian         -4.441e-01  2.154e-02 -20.622  < 2e-16 ***
## pol_coverageGMini           -4.307e-01  4.173e-02 -10.321  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.027 on 12997 degrees of freedom
## Multiple R-squared:  0.05742,    Adjusted R-squared:  0.05553 
## F-statistic: 30.45 on 26 and 12997 DF,  p-value: < 2.2e-16

Compare Models with same variables

Lognormal

reg.logn <- lm(formula = log(surcrete) ~ drv_age1 + vh_ageG + vh_dinG +
    vh_type + pol_bonus + pol_durationG + pol_coverageG, data =df)

summary(reg.logn)
## 
## Call:
## lm(formula = log(surcrete) ~ drv_age1 + vh_ageG + vh_dinG + vh_type + 
##     pol_bonus + pol_durationG + pol_coverageG, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0184 -0.8427 -0.1696  0.7294  2.8528 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           5.8292768  0.1195391  48.765  < 2e-16 ***
## drv_age1              0.0031412  0.0006584   4.771 1.85e-06 ***
## vh_ageG(10,25]       -0.1328789  0.0209189  -6.352 2.19e-10 ***
## vh_ageG(25,30]        0.3169738  0.1430571   2.216 0.026728 *  
## vh_ageG(30,100]      -0.0209880  0.1942372  -0.108 0.913955    
## vh_dinG(50,220]       0.4307782  0.0886671   4.858 1.20e-06 ***
## vh_dinG(220,507]      0.8418149  0.1210098   6.957 3.65e-12 ***
## vh_typeTourism       -0.1140722  0.0322895  -3.533 0.000413 ***
## pol_bonus             0.3297129  0.0944027   3.493 0.000480 ***
## pol_durationG(10,20]  0.0466793  0.0218504   2.136 0.032672 *  
## pol_durationG(20,25] -0.0506218  0.0346514  -1.461 0.144071    
## pol_durationG(25,30]  0.0952796  0.0344595   2.765 0.005701 ** 
## pol_durationG(30,39] -0.1681721  0.0956966  -1.757 0.078882 .  
## pol_coverageGMedian  -0.4527098  0.0215025 -21.054  < 2e-16 ***
## pol_coverageGMini    -0.4343161  0.0417423 -10.405  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.03 on 13009 degrees of freedom
## Multiple R-squared:  0.05128,    Adjusted R-squared:  0.05026 
## F-statistic: 50.23 on 14 and 13009 DF,  p-value: < 2.2e-16
perf_model <- function(model){
  return(list("AIC"= AIC(model), "BIC"= BIC(model), "log-vraisemblance" = logLik(model), "Deviance" =model$deviance, "Null dev." = model$null.deviance))
}

list("AIC"= AIC(reg.logn), "BIC"= BIC(reg.logn), "log-vraisemblance" = logLik(reg.logn))
## $AIC
## [1] 37740.71
## 
## $BIC
## [1] 37860.3
## 
## $`log-vraisemblance`
## 'log Lik.' -18854.35 (df=16)

Gamma

reg.gamma <- glm(surcrete~ drv_age1 + vh_ageG + vh_dinG +
    vh_type + pol_bonus + pol_durationG + pol_coverageG ,family=Gamma(link="log"),data=df)

perf_model(reg.gamma)
## $AIC
## [1] 206876.6
## 
## $BIC
## [1] 206996.2
## 
## $`log-vraisemblance`
## 'log Lik.' -103422.3 (df=16)
## 
## $Deviance
## [1] 15068.94
## 
## $`Null dev.`
## [1] 15664.3

Log gamma

reg.loggamma <- glm(log(surcrete)~ drv_age1 + vh_ageG + vh_dinG +
    vh_type + pol_bonus + pol_durationG + pol_coverageG ,family=Gamma(link='identity'),data=df)
perf_model(reg.loggamma)
## $AIC
## [1] 37056.46
## 
## $BIC
## [1] 37176.06
## 
## $`log-vraisemblance`
## 'log Lik.' -18512.23 (df=16)
## 
## $Deviance
## [1] 331.389
## 
## $`Null dev.`
## [1] 349.8797

Inverse Gaussian

reg.invgaus <- glm(formula = surcrete ~ drv_age1 + vh_ageG + vh_dinG +
    vh_type + pol_bonus + pol_durationG + pol_coverageG, family = inverse.gaussian(link = "log"), start=coefficients(reg.gamma), data = subset(df, select = -c(claim_amount)))
summary(reg.invgaus)
## 
## Call:
## glm(formula = surcrete ~ drv_age1 + vh_ageG + vh_dinG + vh_type + 
##     pol_bonus + pol_durationG + pol_coverageG, family = inverse.gaussian(link = "log"), 
##     data = subset(df, select = -c(claim_amount)), start = coefficients(reg.gamma))
## 
## Deviance Residuals: 
##       Min         1Q     Median         3Q        Max  
## -0.081591  -0.049556  -0.023912   0.004855   0.105755  
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           6.5249942  0.1405852  46.413  < 2e-16 ***
## drv_age1              0.0021746  0.0008516   2.553 0.010678 *  
## vh_ageG(10,25]       -0.1053311  0.0261517  -4.028 5.66e-05 ***
## vh_ageG(25,30]        0.3270079  0.2159604   1.514 0.129999    
## vh_ageG(30,100]      -0.1090480  0.2202828  -0.495 0.620583    
## vh_dinG(50,220]       0.3274655  0.0948179   3.454 0.000555 ***
## vh_dinG(220,507]      0.6290825  0.1574529   3.995 6.49e-05 ***
## vh_typeTourism       -0.0853282  0.0425474  -2.005 0.044932 *  
## pol_bonus             0.3051741  0.1217959   2.506 0.012236 *  
## pol_durationG(10,20]  0.0333220  0.0280969   1.186 0.235656    
## pol_durationG(20,25] -0.0334847  0.0437277  -0.766 0.443837    
## pol_durationG(25,30]  0.1315066  0.0468801   2.805 0.005036 ** 
## pol_durationG(30,39] -0.0304020  0.1189563  -0.256 0.798285    
## pol_coverageGMedian  -0.3948936  0.0254549 -15.513  < 2e-16 ***
## pol_coverageGMini    -0.3931200  0.0476602  -8.248  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for inverse.gaussian family taken to be 0.001723602)
## 
##     Null deviance: 22.871  on 13023  degrees of freedom
## Residual deviance: 22.292  on 13009  degrees of freedom
## AIC: 202366
## 
## Number of Fisher Scoring iterations: 5
perf_model(reg.invgaus)
## $AIC
## [1] 202366.2
## 
## $BIC
## [1] 202485.8
## 
## $`log-vraisemblance`
## 'log Lik.' -101167.1 (df=16)
## 
## $Deviance
## [1] 22.29224
## 
## $`Null dev.`
## [1] 22.87138
#df$prediction<-p

Residuals study

plotgroupresiduals <- function(object, m=100, trim=TRUE, main=NULL, ...)
{
  yh <- fitted.values(object)
  re <- residuals(object)
  if(trim)
    ind <- abs(re) <= quantile(abs(re), probs=.99) & abs(yh) <= quantile(abs(yh), probs=.99)
  else
    ind <- 1:length(re)
  yh <- yh[ind]
  re <- re[ind]
  
  n <- length(yh)
  ind <- sample(1:n, n)
  yh <- yh[ind]
  re <- re[ind]
  
  #group
  if(m > 1)
  {
    yhg <- rowMeans(matrix(yh, ncol=m))
    reg <- rowMeans(matrix(re, ncol=m))
    plot(yhg, reg, ylab = "Group residuals", xlab="Group fitted values", main=main, ...)
  }else
    plot(yh, re, ylab = "Residuals", xlab="Fitted values", main=main, ...)
  abline(h=0, lty=3, col="grey")
}
par(mfrow=c(2,2))
plotgroupresiduals(reg.invgaus, m=1, main = "Inverse Gaussian")
plotgroupresiduals(reg.logn, m=1, main ="Lognormal")
plotgroupresiduals(reg.gamma, m=1, main = "Gamma")
plotgroupresiduals(reg.loggamma, m=1, main = 'Loggamma')# Pas besoin de paquet car ccompare des résidus à des prédictions continues 

## Mean prediction by classes

df$prediction <- predict(reg.invgaus, type ='response')
df$drv_age2G <- cut(df$drv_age1, c(17, 25, 45,100))
fig1 <-  plot_ly(data =df, x = ~drv_age2G, y = ~claim_amount, type = "box", name = 'Boxplot Observed values') %>% layout(xaxis = list(title = 'Driver age', zeroline = TRUE), yaxis = list(title = 'Claim Amount (€)',range = c(0,4000)))

fig2 <- plot_ly(data =df, x = ~drv_age2G, y = ~prediction, type = "box", name = 'Boxplot Inverse gaussian') %>% layout(xaxis = list(title = 'Driver age', zeroline = TRUE), yaxis = list(title = 'Claim Amount (€)',range = c(0,4000)))

fig <- subplot(fig1, fig2) %>% layout(title = "Boxplot des espérance prédites par classes d'âge")
## Warning: Ignoring 1 observations

## Warning: Ignoring 1 observations
fig

New approach: age separation

Par ailleurs, nous pouvons également envisager une méthode de séparation plutôt que la méthode d’écrêtement implémentée ci-dessus. Nous pourrions classer les profils de risque selon leur probabilité d’avoir des montants de sinistres élevés ou standard avec les variables explicatives à disposition. Nous montrons ci-dessous un exemple réalisé à partir de l’âge des conducteurs.

u <- 6000
df$standard <- (df$claim_amount<u) # 1 or TRUE if amount is less than 6000, our target 
age <- seq(18,85)


regC <- glm(standard~bs(drv_age1),data=df,family=binomial) #https://towardsdatascience.com/from-logistic-regression-to-basis-expansions-and-splines-74d6bb3b8dc6
#to understand bsplines, often used in logreg when non obvious linear relation
ypC <- predict(regC,newdata=data.frame(drv_age1=age),type="response",
se=TRUE) #predict proba for every age
## Warning in bs(drv_age1, degree = 3L, knots = numeric(0), Boundary.knots =
## c(19L, : some 'x' values beyond boundary knots may cause ill-conditioned bases
plot(age,ypC$fit,ylim=c(.95,1),type="l",)
polygon(c(age,rev(age)),c(ypC$fit+2*ypC$se.fit,rev(ypC$fit-2*ypC$se.fit)),
col="grey",border=NA)
abline(h=mean(df$standard),lty=2)

Probabilité d’avoir un sinistre standard, étant donné qu’un sinistre est survenu, en fonction de l’âge du conducteur. l’âge du conducteur. Régression logistique avec un lisseur spline.

Nous réalisons maintenant deux modèles pour chaque type de sinistres (standard ou supérieux à 6000€).

indexstandard <- which(df$claim_amount<u)
mean(df$claim_amount[indexstandard])
## [1] 802.0671
mean(df$claim_amount[-indexstandard])
## [1] 10895.21
regA <- glm(claim_amount~bs(drv_age1),data=df[indexstandard,], family=Gamma(link="log"))
summary(regA)
## 
## Call:
## glm(formula = claim_amount ~ bs(drv_age1), family = Gamma(link = "log"), 
##     data = df[indexstandard, ])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.6888  -1.4842  -0.7335   0.2496   3.1197  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    6.80797    0.09699  70.190   <2e-16 ***
## bs(drv_age1)1 -0.55370    0.26677  -2.076    0.038 *  
## bs(drv_age1)2  0.30443    0.18572   1.639    0.101    
## bs(drv_age1)3 -0.09539    0.29746  -0.321    0.748    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 1.844181)
## 
##     Null deviance: 24011  on 12702  degrees of freedom
## Residual deviance: 23958  on 12699  degrees of freedom
## AIC: 193703
## 
## Number of Fisher Scoring iterations: 7
ypA <- predict(regA,newdata=data.frame(drv_age1=age),type="response")
## Warning in bs(drv_age1, degree = 3L, knots = numeric(0), Boundary.knots =
## c(19L, : some 'x' values beyond boundary knots may cause ill-conditioned bases
regB <- glm(claim_amount~bs(drv_age1),data=df[-indexstandard,],family=Gamma(link="log"))
summary(regB)
## 
## Call:
## glm(formula = claim_amount ~ bs(drv_age1), family = Gamma(link = "log"), 
##     data = df[-indexstandard, ])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.6351  -0.4139  -0.2509   0.0201   6.5509  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    8.53233    0.65599  13.007   <2e-16 ***
## bs(drv_age1)1  1.96836    1.53321   1.284    0.200    
## bs(drv_age1)2 -0.08371    0.77285  -0.108    0.914    
## bs(drv_age1)3  0.82435    1.04543   0.789    0.431    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 2.132098)
## 
##     Null deviance: 105.40  on 320  degrees of freedom
## Residual deviance:  99.62  on 317  degrees of freedom
## AIC: 6424.6
## 
## Number of Fisher Scoring iterations: 8
ypB <- predict(regB,newdata=data.frame(drv_age1=age),type="response")
## Warning in bs(drv_age1, degree = 3L, knots = numeric(0), Boundary.knots =
## c(20L, : some 'x' values beyond boundary knots may cause ill-conditioned bases
reg <- glm(claim_amount~bs(drv_age1),data=df,family=Gamma(link="log"))
yp <- predict(reg,newdata=data.frame(drv_age1=age),type="response")
## Warning in bs(drv_age1, degree = 3L, knots = numeric(0), Boundary.knots =
## c(19L, : some 'x' values beyond boundary knots may cause ill-conditioned bases
ypC <- predict(regC,newdata=data.frame(drv_age1=age),type="response")
## Warning in bs(drv_age1, degree = 3L, knots = numeric(0), Boundary.knots =
## c(19L, : some 'x' values beyond boundary knots may cause ill-conditioned bases
plot(age,yp,type="l",lwd=2,ylab="Average cost",xlab="Age of the driver")
lines(age,ypC*ypA+(1-ypC)*ypB,type="h",col="grey",lwd=6) 
#expliquer la formule : prix moyen : proba de standard * prix standard + proba large montant * prix large montant

lines(age,ypC*ypA,type="h",col="black",lwd=6)
abline(h= mean(df$claim_amount),lty=2)

La ligne horizontale représente le coût moyen d’un sinistre. La ligne sombre à l’arrière, est la prédiction sur l’ensemble de données. La partie sombre est la partie de la partie du sinistre moyen liée aux sinistres standard (inférieurs à s) et la zone plus claire est la part du sinistre moyen due à d’éventuels sinistres importants (supérieurs à s).